# Principal Component Analysis
In this notebook the Principal Component Analysis is used for dimensionality reduction of the data.

In [1]:
import sys
import os
sys.path.append('../')
from src import reader as r
from src import visualization as v
from src import metrics as m

In [2]:
import numpy as np
import pandas as pd
import sklearn
print(sklearn.__version__)
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

0.20.0


In [3]:
X = r.readWord2Vec()
print(X[:10])
print(X.shape)

['word2vec.csv', 'health.txt', 'bags.csv']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(13229, 1203)


## Testing for Different Variances
### Keeping variance as 95%

In [4]:
pca = PCA(n_components = 0.95, svd_solver='full')
pca_result = pca.fit_transform(X)

The number of dimensions went from 1203 to 989

In [5]:
print(pca_result.shape)

(13229, 989)


In [6]:
best_K = 32
print("#############################")
print("Best K =", best_K)
print("Applying K-means")
best_cluster = KMeans(n_clusters=best_K, n_jobs=-1)
best_cluster_result = best_cluster.fit(pca_result)
y_pred = best_cluster_result.labels_
print("Finished")
print("#############################") 

#############################
Best K = 32
Applying K-means
Finished
#############################


In [7]:
m.applyAllMetrics(pca_result,y_pred)

Davies Bouldin
4.23885799453555


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


Calinski and Harabaz
47.90113481004311
Silhouette Score
0.03560216398309494


### Keeping variance of 90%

In [8]:
pca = PCA(n_components = 0.90, svd_solver='full')
pca_result = pca.fit_transform(X)

In [9]:
print(pca_result.shape)

(13229, 848)


In [10]:
best_K = 32
print("#############################")
print("Best K =", best_K)
print("Applying K-means")
best_cluster = KMeans(n_clusters=best_K, n_jobs=-1)
best_cluster_result = best_cluster.fit(pca_result)
y_pred = best_cluster_result.labels_
print("Finished")
print("#############################") 

#############################
Best K = 32
Applying K-means
Finished
#############################


In [11]:
m.applyAllMetrics(pca_result,y_pred)

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


Davies Bouldin
4.342050681060139
Calinski and Harabaz
44.772361700890706
Silhouette Score
0.030814891230140914


### Keeping variance of 85%

In [12]:
pca = PCA(n_components = 0.85, svd_solver='full')
pca_result = pca.fit_transform(X)

In [13]:
print(pca_result.shape)

(13229, 733)


In [14]:
best_K = 32
print("#############################")
print("Best K =", best_K)
print("Applying K-means")
best_cluster = KMeans(n_clusters=best_K, n_jobs=-1)
best_cluster_result = best_cluster.fit(pca_result)
y_pred = best_cluster_result.labels_
print("Finished")
print("#############################") 

#############################
Best K = 32
Applying K-means
Finished
#############################


In [15]:
m.applyAllMetrics(pca_result,y_pred)

Davies Bouldin
3.780977215530653


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


Calinski and Harabaz
49.89050569202297
Silhouette Score
0.030154424805214766


### Keeping variance of 80%

In [16]:
pca = PCA(n_components = 0.85, svd_solver='full')
pca_result = pca.fit_transform(X)

In [17]:
print(pca_result.shape)

(13229, 733)


In [18]:
best_K = 32
print("#############################")
print("Best K =", best_K)
print("Applying K-means")
best_cluster = KMeans(n_clusters=best_K, n_jobs=-1)
best_cluster_result = best_cluster.fit(pca_result)
y_pred = best_cluster_result.labels_
print("Finished")
print("#############################") 

#############################
Best K = 32
Applying K-means
Finished
#############################


In [19]:
m.applyAllMetrics(pca_result,y_pred)

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


Davies Bouldin
4.200904741126107
Calinski and Harabaz
51.97311164865916
Silhouette Score
0.032431197126351195
