<a href="https://colab.research.google.com/github/alexoliveros92/PCA_UMAP/blob/main/PCA_UMAP_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install umap-learn[plot]
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objs as go
import plotly.figure_factory as ff
import umap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn[plot]
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn[plot])
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datashader (from umap-learn[plot])
  Downloading datashader-0.14.4-py2.py3-none-any.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Collecting datashape (from datashader->umap-learn[plot])
  Downloading datashape-0.5.2.tar.gz (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Load the municipalities dataset and take a first look at it:
# first rows, (rows, columns) shape, column summary, and descriptive statistics.
dataset = pd.read_csv("/content/Municipalities.csv")
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
dataset.info()
print(dataset.describe())


           tc        area    pop        pden          wden  urb      paper  \
0  502.250000  283.750000  22648   79.816742  119671.47660    2   4.363508   
1  228.050003   34.439999   4952  143.786301   70030.98438    3   9.887817   
2  268.010010   26.620001   3895  146.318558   81116.52344    3  11.991079   
3  199.089996   84.300003   7140   84.697502   43320.46094    3   9.762878   
4  233.639999   35.700001  12193  341.540619  201565.26560    2   6.601569   

      glass     metal   plastic      msw_so    msw_un         msw    sor  geo  \
0  3.592508  0.462317  1.131815  20396261.0  13560520  33956781.0  60.07    3   
1  9.518352  1.860965  4.643623   1831407.0    580460   2411867.0  75.93    3   
2  6.653014  0.744725  5.224834   1694922.0    464400   2159322.0  78.49    3   
3  7.551381  0.746540  5.202531   2881055.0    770860   3651915.0  78.89    3   
4  4.334883  0.103101  5.120555   3026700.0   4169180   7195880.0  42.06    1   

   roads  s_wteregio  s_landfill  
0  285.0 

In [4]:
# Separate the 'urb' column (used as the label) from the remaining columns (features)
Y = dataset['urb'] # Labels
X = dataset.drop(columns=['urb']) # Features

# Report the container type of each part, then the dimensions of each part
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(3642, 17)
(3642,)


In [5]:
# Standardize the features: learn each column's mean and standard deviation,
# then rescale so that every column has mean 0 and variance 1
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

In [6]:
# Implementing PCA to visualize dataset:
# project the standardized features onto the first two principal components
# and draw them as a scatter plot colored by the 'urb' label.
#
# random_state pins the result whenever scikit-learn selects a randomized or
# ARPACK solver; it has no effect when an exact solver is used.
pca = PCA(n_components=2, random_state=42)
x_pca = pca.fit_transform(X_scaled)
print("Variance explained by each of the n_components: ",pca.explained_variance_ratio_)
print("Total variance explained by the n_components: ",pca.explained_variance_ratio_.sum())

# Label values as a plain list, used to build one hover string per point
urb=list(dataset['urb'])
data = [go.Scatter(x=x_pca[:,0], y=x_pca[:,1], mode='markers',
                   marker=dict(color=Y, colorscale='Rainbow', opacity=0.5),
                   text=[f'urb: {a}' for a in urb],
                   hoverinfo='text')]

layout = go.Layout(title = 'PCA Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Principal Component'),
                    yaxis = dict(title='Second Principal Component'))
fig = go.Figure(data=data, layout=layout)
fig.show()

Variance explained by each of the n_components:  [0.27823138 0.16274474]
Total variance explained by the n_components:  0.4409761187367872


In [13]:
# Implementing UMAP to visualize dataset - coloring by urb index
#
# UMAP is a stochastic algorithm: without a fixed random_state each run produces
# a different embedding, so the figure could not be reproduced after a kernel
# restart. Fixing the seed makes the layout repeatable.
u = umap.UMAP(n_components=2, n_neighbors=5, min_dist=0.05, random_state=42)
x_umap = u.fit_transform(X_scaled)

# One marker per row of X_scaled; color encodes the label, hover text shows its value
data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                   marker=dict(color=Y, colorscale='Rainbow', opacity=0.5),
                   text=[f'urb: {a}' for a in urb],
                   hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()