# 1. Importing Libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from hmmlearn import hmm
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, davies_bouldin_score

import matplotlib.pyplot as plt
import seaborn as sns

import time
import os

from warnings import filterwarnings
filterwarnings('ignore')

# 2. Importing Data

In [26]:
# Importing data: the 'id' column of data.csv becomes the DataFrame index,
# which save_results() later writes out as the 'Id' column of each submission.
data = pd.read_csv('../Datasets/raw_datasets/data.csv', index_col='id')
# Sample submission file; it is not referenced again in the visible cells.
submission  = pd.read_csv('../Datasets/raw_datasets/sample_submission.csv')

# Making a deep copy of the data: feature selection below reads from data_copy,
# while `data` itself is only used again for its index.
data_copy = data.copy(deep = True)

In [27]:
# Integer-typed columns of the working copy
int_data = data_copy.select_dtypes(int)

# Integer data column names as a list
int_data_cols = int_data.columns.to_list()

# Float-typed columns of the working copy
# (float_data and float_data_cols are not referenced again in the visible cells)
float_data = data_copy.select_dtypes(float)

# Float data column names as a list
float_data_cols = float_data.columns.to_list()

# data_copy column names as a list
data_copy_cols_list = data_copy.columns.to_list()

# Non-normal float data: the columns at positions 22..28 (the slice end, 29, is exclusive).
# NOTE(review): this is a positional selection, so it silently depends on the column
# order of data.csv; presumably these columns were identified as non-normal in an
# exploration step that is not part of this notebook — confirm before reusing.
non_norm_float_data_list = data_copy_cols_list[22:29]

# Selected features: every integer column followed by the non-normal float columns
selected_features_cols_list = int_data_cols + non_norm_float_data_list
selected_data = data_copy[selected_features_cols_list]

# 3. Transforming Data

In [28]:
# Two-step preprocessing pipeline: RobustScaler first, then PowerTransformer,
# both constructed with their default settings.
robust_scaler_power_transformer = Pipeline([
    ('robust_scaler', RobustScaler()),
    ('power_transformer', PowerTransformer())
])

# Apply transformation: fit the pipeline on the selected features and transform them
# in one call. transformed_data is the matrix passed to fit_gmm / fit_bgmm below.
transformed_data = robust_scaler_power_transformer.fit_transform(selected_data)

# 4. Defining the number of clusters and the path to save output submission files

In [29]:
# Number of mixture components (clusters) to fit. This notebook-level value is read
# directly by save_results, fit_gmm and fit_bgmm rather than being passed as an argument.
n_clusters = 7

# Output path: directory (relative to the notebook) where save_results writes its CSV files
output_path = '../Datasets/final/'

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_path, exist_ok=True)

# Initialize the results dictionary; the training cell fills it with one entry per
# model name. Re-running this cell resets it to empty.
results = {}

# 5. Defining Various Functions for Clustering

In [30]:
# Function to save results
def save_results(labels, method, transformer_name):
    """Write cluster labels to a submission CSV and return the path written.

    The CSV has two columns: ``Id`` (the index of the notebook-level ``data``
    frame) and ``Predicted`` (``labels``). It is written without the pandas
    row index. The file is placed in the notebook-level ``output_path``
    directory and named
    ``{method}_{n_clusters}_clusters_{transformer_name}.csv``, where
    ``n_clusters`` is also read from the notebook level.

    Parameters
    ----------
    labels : array-like
        Cluster label for each row of ``data``, in the same order.
    method : str
        Short model name used in the file name (e.g. "GMM").
    transformer_name : str
        Name of the preprocessing that was applied; used in the file name.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    output_df = pd.DataFrame({'Id': data.index, 'Predicted': labels})
    # Join path components instead of concatenating strings, so the file lands
    # inside output_path whether or not that setting ends with a separator.
    filename = os.path.join(
        output_path, f"{method}_{n_clusters}_clusters_{transformer_name}.csv"
    )
    output_df.to_csv(filename, index=False)
    return filename

In [31]:
# Function to fit GMM and evaluate
def fit_gmm(data, transformer_name):
    """Fit a Gaussian mixture model and score the resulting clustering.

    The number of components is the notebook-level ``n_clusters``; the model
    is seeded with ``random_state=42``. Progress and elapsed time are printed.
    The reported time covers fitting, label prediction and both metrics.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster. Inside this function the name refers to
        this argument, not to the notebook-level raw data frame.
    transformer_name : str
        Name of the preprocessing that produced ``data``; only used in the
        printed messages.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, labels)``: the silhouette score, the
        Davies-Bouldin score and the predicted component label per row.
    """
    print(f"Training GMM with {n_clusters} clusters using {transformer_name}...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    model = GaussianMixture(n_components=n_clusters, random_state=42)
    # fit_predict returns the same labels as fit(data).predict(data) without
    # a separate prediction pass over the data.
    labels = model.fit_predict(data)
    # The notebook silences all warnings, which hides sklearn's convergence
    # warning; report a non-converged fit explicitly instead.
    if not model.converged_:
        print(f"Warning: GMM with {n_clusters} clusters using {transformer_name} did not converge; "
              "scores and labels may be unreliable.")
    silhouette = silhouette_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    elapsed = time.perf_counter() - start_time
    print(f"GMM with {n_clusters} clusters using {transformer_name} completed in {elapsed:.2f} seconds.")
    return silhouette, davies_bouldin, labels

In [32]:
# Function to fit BGMM and evaluate
def fit_bgmm(data, transformer_name):
    """Fit a Bayesian Gaussian mixture model and score the resulting clustering.

    The number of components is the notebook-level ``n_clusters``; the model
    is seeded with ``random_state=42``. Progress and elapsed time are printed.
    The reported time covers fitting, label prediction and both metrics.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster. Inside this function the name refers to
        this argument, not to the notebook-level raw data frame.
    transformer_name : str
        Name of the preprocessing that produced ``data``; only used in the
        printed messages.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, labels)``: the silhouette score, the
        Davies-Bouldin score and the predicted component label per row.
    """
    print(f"Training BGMM with {n_clusters} clusters using {transformer_name}...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    model = BayesianGaussianMixture(n_components=n_clusters, random_state=42)
    # fit_predict returns the same labels as fit(data).predict(data) without
    # a separate prediction pass over the data.
    labels = model.fit_predict(data)
    # The notebook silences all warnings, which hides sklearn's convergence
    # warning; report a non-converged fit explicitly instead.
    if not model.converged_:
        print(f"Warning: BGMM with {n_clusters} clusters using {transformer_name} did not converge; "
              "scores and labels may be unreliable.")
    silhouette = silhouette_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    elapsed = time.perf_counter() - start_time
    print(f"BGMM with {n_clusters} clusters using {transformer_name} completed in {elapsed:.2f} seconds.")
    return silhouette, davies_bouldin, labels

# 6. Training the Clustering Algorithms and Saving the Outputs as Submission CSV Files

In [33]:
# Fit each mixture model in turn (GMM first, then BGMM) on the transformed features,
# record its scores and labels under the model's name, and write its submission file.
transformer_label = "robust_scaler_power_transformer"
for method_name, fit_function in (("GMM", fit_gmm), ("BGMM", fit_bgmm)):
    silhouette, davies_bouldin, labels = fit_function(transformed_data, transformer_label)
    results[method_name] = {
        'Silhouette Score': silhouette,
        'Davies Bouldin Score': davies_bouldin,
        'Labels': labels
    }
    save_results(labels, method_name, transformer_label)

Training GMM with 7 clusters using robust_scaler_power_transformer...
GMM with 7 clusters using robust_scaler_power_transformer completed in 127.19 seconds.
Training BGMM with 7 clusters using robust_scaler_power_transformer...
BGMM with 7 clusters using robust_scaler_power_transformer completed in 137.25 seconds.


In [34]:
# Report the two clustering-quality scores stored for every fitted model
for model, scores in results.items():
    print(f"Model: {model}")
    print(f"Silhouette Score: {scores['Silhouette Score']}")
    print(f"Davies Bouldin Score: {scores['Davies Bouldin Score']}")
    print('-'*30)

Model: GMM
Silhouette Score: 0.03820238432848022
Davies Bouldin Score: 3.5648224880234394
------------------------------
Model: BGMM
Silhouette Score: 0.037749610172124484
Davies Bouldin Score: 3.586824312339798
------------------------------


# 7. Repeating the Process Above for Other Numbers of Clusters

In [35]:
# # Define the transformations
# power_transformer = PowerTransformer()

# robust_scaler_power_transformer = Pipeline([
#     ('robust_scaler', RobustScaler()),
#     ('power_transformer', PowerTransformer())
# ])

# # Define the number of clusters to try
# n_clusters_list = [12, 6, 8, 7]

# # Output path
# output_path = '../Datasets/final/'

# # Ensure the output directory exists
# os.makedirs(output_path, exist_ok=True)

# # Initialize the results dictionary
# results = {}

# # Function to save results
# def save_results(labels, method, n_clusters, transformer_name, index):
#     output_df = pd.DataFrame({'Id': index, 'Predicted': labels})
#     filename = f"{output_path}{method}_{n_clusters}_clusters_{transformer_name}.csv"
#     output_df.to_csv(filename, index=False)

# # Function to fit GMM and evaluate
# def fit_gmm(data, n_clusters, transformer_name):
#     print(f"Training GMM with {n_clusters} clusters using {transformer_name}...")
#     start_time = time.time()
#     model = GaussianMixture(n_components=n_clusters, random_state=42)
#     model.fit(data)
#     labels = model.predict(data)
#     bic = model.bic(data)
#     silhouette = silhouette_score(data, labels)
#     davies_bouldin = davies_bouldin_score(data, labels)
#     end_time = time.time()
#     print(f"GMM with {n_clusters} clusters using {transformer_name} completed in {end_time - start_time:.2f} seconds.")
#     return bic, silhouette, davies_bouldin, labels

# # Function to fit BGMM and evaluate
# def fit_bgmm(data, n_clusters, transformer_name):
#     print(f"Training BGMM with {n_clusters} clusters using {transformer_name}...")
#     start_time = time.time()
#     model = BayesianGaussianMixture(n_components=n_clusters, random_state=42)
#     model.fit(data)
#     labels = model.predict(data)
#     silhouette = silhouette_score(data, labels)
#     davies_bouldin = davies_bouldin_score(data, labels)
#     end_time = time.time()
#     print(f"BGMM with {n_clusters} clusters using {transformer_name} completed in {end_time - start_time:.2f} seconds.")
#     return None, silhouette, davies_bouldin, labels  # BIC not available for BGMM

# # Function to fit DPGMM and evaluate
# def fit_dpgmm(data, n_clusters, transformer_name):
#     print(f"Training DPGMM with {n_clusters} clusters using {transformer_name}...")
#     start_time = time.time()
#     model = BayesianGaussianMixture(n_components=n_clusters, covariance_type='full', weight_concentration_prior_type='dirichlet_process', random_state=42)
#     model.fit(data)
#     labels = model.predict(data)
#     silhouette = silhouette_score(data, labels)
#     davies_bouldin = davies_bouldin_score(data, labels)
#     end_time = time.time()
#     print(f"DPGMM with {n_clusters} clusters using {transformer_name} completed in {end_time - start_time:.2f} seconds.")
#     return None, silhouette, davies_bouldin, labels  # BIC not available for DPGMM

# # Function to fit HMM and evaluate
# def fit_hmm(data, n_clusters, transformer_name):
#     print(f"Training HMM with {n_clusters} clusters using {transformer_name}...")
#     start_time = time.time()
#     model = hmm.GaussianHMM(n_components=n_clusters, covariance_type="full", random_state=42)
#     model.fit(data)
#     labels = model.predict(data)
#     silhouette = silhouette_score(data, labels)
#     davies_bouldin = davies_bouldin_score(data, labels)
#     end_time = time.time()
#     print(f"HMM with {n_clusters} clusters using {transformer_name} completed in {end_time - start_time:.2f} seconds.")
#     return None, silhouette, davies_bouldin, labels  # HMM does not provide BIC directly

# # Apply transformations and fit models
# for transformer, transformer_name in zip([power_transformer, robust_scaler_power_transformer], ['power_transformer', 'robust_scaler_power_transformer']):
#     transformed_data = transformer.fit_transform(selected_data)
#     results[transformer_name] = {}
    
#     for n_clusters in n_clusters_list:
#         results[transformer_name][n_clusters] = {}

#         # GMM
#         bic, silhouette, davies_bouldin, labels = fit_gmm(transformed_data, n_clusters, transformer_name)
#         results[transformer_name][n_clusters]['GMM'] = {
#             'BIC': bic,
#             'Silhouette Score': silhouette,
#             'Davies Bouldin Score': davies_bouldin,
#             'Labels': labels
#         }
#         save_results(labels, "GMM", n_clusters, transformer_name, data.index)

#         # BGMM
#         bic, silhouette, davies_bouldin, labels = fit_bgmm(transformed_data, n_clusters, transformer_name)
#         results[transformer_name][n_clusters]['BGMM'] = {
#             'BIC': bic,
#             'Silhouette Score': silhouette,
#             'Davies Bouldin Score': davies_bouldin,
#             'Labels': labels
#         }
#         save_results(labels, "BGMM", n_clusters, transformer_name, data.index)

#         # DPGMM
#         bic, silhouette, davies_bouldin, labels = fit_dpgmm(transformed_data, n_clusters, transformer_name)
#         results[transformer_name][n_clusters]['DPGMM'] = {
#             'BIC': bic,
#             'Silhouette Score': silhouette,
#             'Davies Bouldin Score': davies_bouldin,
#             'Labels': labels
#         }
#         save_results(labels, "DPGMM", n_clusters, transformer_name, data.index)

#         # HMM
#         _, silhouette, davies_bouldin, labels = fit_hmm(transformed_data, n_clusters, transformer_name)
#         results[transformer_name][n_clusters]['HMM'] = {
#             'BIC': None,  # Not available for HMM
#             'Silhouette Score': silhouette,
#             'Davies Bouldin Score': davies_bouldin,
#             'Labels': labels
#         }
#         save_results(labels, "HMM", n_clusters, transformer_name, data.index)

# # Print results
# for transformer in results:
#     print(f"Results for {transformer}:")
#     for n_clusters in results[transformer]:
#         print(f"Number of clusters: {n_clusters}")
#         for model in results[transformer][n_clusters]:
#             print(f"Model: {model}")
#             print(f"BIC: {results[transformer][n_clusters][model]['BIC']}")
#             print(f"Silhouette Score: {results[transformer][n_clusters][model]['Silhouette Score']}")
#             print(f"Davies Bouldin Score: {results[transformer][n_clusters][model]['Davies Bouldin Score']}")
#             print('-'*30)