In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralBiclustering, SpectralClustering, AgglomerativeClustering, DBSCAN, OPTICS, Birch

from sklearn.cluster import KMeans,MeanShift,SpectralClustering
from sklearn.mixture import GaussianMixture,BayesianGaussianMixture
from sklearn.decomposition import PCA,TruncatedSVD,NMF,FastICA,FactorAnalysis
from sklearn.cluster import SpectralCoclustering,SpectralBiclustering
from sklearn.manifold import LocallyLinearEmbedding,TSNE

from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
nummercl=7
from sklearn.preprocessing import RobustScaler, PowerTransformer

from tqdm import tqdm


In [2]:
# Importing the data

data = pd.read_csv("data/data.csv")
submission = pd.read_csv("data/sample_submission.csv")
# Dropping ID column
data.drop(columns = 'id', inplace=True)
data.head()
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
f_00,98000.0,0.00122,1.002801,-4.732235,-0.675226,0.002022,0.677271,4.490521
f_01,98000.0,0.00558,1.000742,-4.202795,-0.670985,0.00665,0.677746,4.324974
f_02,98000.0,-0.001042,1.001373,-4.377021,-0.672779,-0.000324,0.677086,4.560247
f_03,98000.0,-0.0007,1.000422,-4.010826,-0.67254,-0.003185,0.672097,4.399373
f_04,98000.0,-0.003522,1.003061,-4.535903,-0.68251,-0.003307,0.677589,4.050549
f_05,98000.0,-0.001612,1.000532,-4.300767,-0.675066,0.001024,0.673344,4.710316
f_06,98000.0,-0.003042,0.997434,-4.894525,-0.680421,-0.002053,0.668112,3.998595
f_07,98000.0,5.545918,3.69184,0.0,3.0,5.0,8.0,32.0
f_08,98000.0,6.763061,4.152348,0.0,4.0,6.0,9.0,30.0
f_09,98000.0,8.193163,5.904919,0.0,4.0,7.0,11.0,44.0


In [3]:
# Random Sampling
#df = data.sample(1000, random_state=0)
df = data

In [5]:
# Preprocessing

from sklearn import preprocessing
from sklearn import metrics


scalers=[
    preprocessing.PowerTransformer(),
    preprocessing.StandardScaler(),
    preprocessing.Normalizer(),
    preprocessing.RobustScaler(),
    preprocessing.QuantileTransformer(output_distribution='normal')
]

scores = []
for scaler in tqdm(scalers):
    X_scaled= scaler.fit_transform(df)
    X_scaled = preprocessing.PowerTransformer().fit_transform(X_scaled)
    X_scaled = pd.DataFrame(X_scaled, columns = df.columns)


100%|██████████| 5/5 [00:17<00:00,  3.47s/it]


In [6]:
# Parameters

ncomponents = 7
ninit = 3
scores=[]

In [7]:
# Affinity Propagation, MeanShift, Spectral Clustering, Ward, Agglomerative Clustering, DBSCAN, OPTICS, BIRCH, Gaussian Mixture
autoclust = [
    BayesianGaussianMixture(n_components= ncomponents, verbose=2, random_state=0),
    GaussianMixture(n_components= ncomponents, covariance_type = 'full', n_init= ninit, random_state=0, verbose=2),
    KMeans(n_clusters= ncomponents, random_state=0, verbose=2), 
]

for gmm in tqdm(autoclust):
    preds = gmm.fit_predict(X_scaled)
    shs=metrics.silhouette_score(X_scaled, preds, metric='euclidean')
    chs=metrics.calinski_harabasz_score(X_scaled, preds)
    dbs=metrics.davies_bouldin_score(X_scaled, preds)
    
    submission['Predicted'] = 0
    if len(data) == len(X_scaled):
        submission['Predicted'] = preds
        submission.to_csv(str(gmm)+str(scaler)+'submission.csv', index=False)
    else:
        if len(scores)==0:
            mpreds=preds
            scores.append([scaler, gmm, shs, chs, dbs, 1])
        
        else:
            for mtrc in [metrics.v_measure_score]:
                scores.append([scaler, gmm, shs, chs, dbs, mtrc(mpreds, preds)])

  0%|          | 0/3 [00:00<?, ?it/s]

Initialization 0
  Iteration 10	 time lapse 7.52210s	 ll change 1459.30840
  Iteration 20	 time lapse 6.84961s	 ll change 101.93143
  Iteration 30	 time lapse 6.78439s	 ll change 34.42304
  Iteration 40	 time lapse 6.60362s	 ll change 19.85088
  Iteration 50	 time lapse 6.71364s	 ll change 17.08065
  Iteration 60	 time lapse 6.53949s	 ll change 5.23341
  Iteration 70	 time lapse 6.92409s	 ll change 25.30353
  Iteration 80	 time lapse 6.70321s	 ll change 7.61225
  Iteration 90	 time lapse 6.54769s	 ll change 1.10878
  Iteration 100	 time lapse 6.52341s	 ll change 0.30076
Initialization converged: False	 time lapse 67.71148s	 ll -1314756.46146


 33%|███▎      | 1/3 [05:11<10:23, 311.53s/it]

Initialization 0
  Iteration 10	 time lapse 7.13795s	 ll change 0.01576
  Iteration 20	 time lapse 6.62567s	 ll change 0.00119
Initialization converged: True	 time lapse 14.44509s	 ll -39.93936
Initialization 1
  Iteration 10	 time lapse 7.05213s	 ll change 0.00715
  Iteration 20	 time lapse 6.43239s	 ll change 0.00308
Initialization converged: True	 time lapse 17.38634s	 ll -39.96619
Initialization 2
  Iteration 10	 time lapse 7.34710s	 ll change 0.00975
  Iteration 20	 time lapse 6.47756s	 ll change 0.00306
Initialization converged: True	 time lapse 18.58171s	 ll -39.94032


 67%|██████▋   | 2/3 [10:04<05:00, 300.84s/it]

Initialization complete
Iteration 0, inertia 3476823.114524878.
Iteration 1, inertia 2612638.5889175395.
Iteration 2, inertia 2573770.8332747365.
Iteration 3, inertia 2551421.6946188607.
Iteration 4, inertia 2541163.024700252.
Iteration 5, inertia 2535338.7742865155.
Iteration 6, inertia 2530954.7286915155.
Iteration 7, inertia 2527332.3068461698.
Iteration 8, inertia 2524344.4174886006.
Iteration 9, inertia 2522057.3031755732.
Iteration 10, inertia 2520325.884709811.
Iteration 11, inertia 2519030.7724731797.
Iteration 12, inertia 2518023.514937206.
Iteration 13, inertia 2517204.3102503587.
Iteration 14, inertia 2516539.8905248493.
Iteration 15, inertia 2515965.248048659.
Iteration 16, inertia 2515446.0176345916.
Iteration 17, inertia 2514938.3936920734.
Iteration 18, inertia 2514452.1257295255.
Iteration 19, inertia 2513978.9697689004.
Iteration 20, inertia 2513542.1729426472.
Iteration 21, inertia 2513165.972632407.
Iteration 22, inertia 2512852.6255271705.
Iteration 23, inertia 2512

100%|██████████| 3/3 [13:58<00:00, 279.38s/it]


In [11]:
scores

[]

In [8]:
submission

Unnamed: 0,Id,Predicted
0,0,5
1,1,6
2,2,1
3,3,3
4,4,6
...,...,...
97995,97995,0
97996,97996,6
97997,97997,4
97998,97998,3
