In [1]:
import numpy as np
import pandas as pd

from timeit import default_timer as timer
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from scipy.stats import kurtosis
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from timeit import default_timer as timer
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from datasets.HiggsBosonDataset import HiggsBosonDataset
from datasets.MappingDataset import MappingDataset
%load_ext autoreload
%autoreload 2
from utils import *

In [2]:
# Instantiate the two dataset wrappers used by the rest of the notebook.
higgs, mapping = HiggsBosonDataset(), MappingDataset()

# KMeans Clustering
---
## Higgs Boson Dataset

## Mapping Dataset

In [50]:
# Retrieve the train and test partitions from the Mapping dataset wrapper;
# each call is unpacked into an (X, y) pair.
(X_train, y_train), (X_test, y_test) = (
    mapping.get_train_data(),
    mapping.get_test_data(),
)

### PCA

In [51]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9)

# Fit the projection on the training data only, then reuse it for the test data.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [54]:
# Cluster the PCA-reduced training data. `random_state` makes the centroid
# initialisation repeatable, and `n_init=10` pins the number of restarts,
# whose library default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)

# Time only the fit. The result is named `fit_time` so that it cannot shadow
# a `time` module brought into the namespace by a star import.
t0 = timer()
kmeans.fit(X_train_pca)
fit_time = timer() - t0
print('Fit Time: %.3f seconds' % fit_time)

# NOTE(review): the saved output shows the same inertia for train and test --
# presumably bench_kmeans reports the fitted model's inertia rather than one
# computed on the data passed in; confirm in utils.
bench_kmeans(kmeans, '\tkmeans Train Set', X_train_pca, y_train)
print()
bench_kmeans(kmeans, '\tkmeans Test Set', X_test_pca, y_test)

Fit Time: 0.702 seconds
Estimator:	kmeans Train Set
intertia	143093
Homogeneity	0.414
AMI		0.196

Estimator:	kmeans Test Set
intertia	143093
Homogeneity	0.383
AMI		0.335


### ICA

In [66]:
# Reduce the data to 15 independent components. FastICA starts from a random
# unmixing matrix, so `random_state` is fixed to make the components (and the
# clustering built on them) repeatable across runs.
# NOTE(review): tol=0.2 is much looser than the library default (1e-4) --
# confirm this was chosen deliberately.
ica = FastICA(n_components=15, tol=0.2, random_state=42)

# Fit on the training data only, then apply the same unmixing to the test data.
X_train_ica = ica.fit_transform(X_train)
X_test_ica = ica.transform(X_test)

In [67]:
# Display the (rows, columns) of the ICA-transformed training data.
np.shape(X_train_ica)

(10545, 15)

In [69]:
# Cluster the ICA-reduced training data. `random_state` makes the centroid
# initialisation repeatable, and `n_init=10` pins the number of restarts,
# whose library default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)

# Time only the fit. The result is named `fit_time` so that it cannot shadow
# a `time` module brought into the namespace by a star import.
t0 = timer()
kmeans.fit(X_train_ica)
fit_time = timer() - t0
print('Fit Time: %.3f seconds' % fit_time)

# Score the same fitted model against the train and test labels.
bench_kmeans(kmeans, '\tkmeans Train Set', X_train_ica, y_train)
print()
bench_kmeans(kmeans, '\tkmeans Test Set', X_test_ica, y_test)

Fit Time: 1.213 seconds
Estimator:	kmeans Train Set
intertia	10
Homogeneity	0.347
AMI		0.148

Estimator:	kmeans Test Set
intertia	10
Homogeneity	0.403
AMI		0.326


### Random Projection

In [72]:
# Project the data onto 15 random Gaussian directions. The projection matrix
# is drawn at fit time, so `random_state` is fixed to keep the reduced
# features -- and everything computed from them -- repeatable across runs.
grp = GaussianRandomProjection(n_components=15, random_state=42)

# Draw the projection from the training data, then reuse it for the test data.
X_train_grp = grp.fit_transform(X_train)
X_test_grp = grp.transform(X_test)

In [75]:
# Cluster the randomly projected training data. `random_state` makes the
# centroid initialisation repeatable, and `n_init=10` pins the number of
# restarts, whose library default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=42)

# Time only the fit. The result is named `fit_time` so that it cannot shadow
# a `time` module brought into the namespace by a star import.
t0 = timer()
kmeans.fit(X_train_grp)
fit_time = timer() - t0
print('Fit Time: %.3f seconds' % fit_time)

# Score the same fitted model against the train and test labels.
bench_kmeans(kmeans, '\tkmeans Train Set', X_train_grp, y_train)
print()
bench_kmeans(kmeans, '\tkmeans Test Set', X_test_grp, y_test)

Fit Time: 0.873 seconds
Estimator:	kmeans Train Set
intertia	143823
Homogeneity	0.355
AMI		0.158

Estimator:	kmeans Test Set
intertia	143823
Homogeneity	0.383
AMI		0.300


### Tree-Based Feature Selection

In [77]:
# Fit an extra-trees ensemble on the labelled training data. The trees are
# randomised, so `random_state` is fixed to keep the feature importances (and
# therefore the selected feature subset) the same on every run.
clf = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# prefit=True makes the selector reuse the already-fitted forest instead of
# refitting it; the same selection is applied to train and test.
selector = SelectFromModel(clf, prefit=True)
X_train_tree = selector.transform(X_train)
X_test_tree = selector.transform(X_test)

In [79]:
# Cluster the tree-selected training features. `random_state` makes the
# centroid initialisation repeatable, and `n_init=10` pins the number of
# restarts, whose library default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)

# Time only the fit. The result is named `fit_time` so that it cannot shadow
# a `time` module brought into the namespace by a star import.
t0 = timer()
kmeans.fit(X_train_tree)
fit_time = timer() - t0
print('Fit Time: %.3f seconds' % fit_time)

# Score the same fitted model against the train and test labels.
bench_kmeans(kmeans, '\tkmeans Train Set', X_train_tree, y_train)
print()
bench_kmeans(kmeans, '\tkmeans Test Set', X_test_tree, y_test)

Fit Time: 0.545 seconds
Estimator:	kmeans Train Set
intertia	35209
Homogeneity	0.403
AMI		0.193

Estimator:	kmeans Test Set
intertia	35209
Homogeneity	0.377
AMI		0.328
