# import Libs

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture,BayesianGaussianMixture
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer

# import Data

In [None]:
# Read the Tabular Playground (July 2022) data file into a DataFrame.
DATA_PATH = '../input/tabular-playground-series-jul-2022/data.csv'
train = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
train.head(n=5)

# Analysis

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
train.info()

In [None]:
# Summary statistics for every feature (one row per feature), shaded so that
# larger values within each statistic stand out. The 'id' column is excluded.
feature_summary = train.drop(columns='id').describe().transpose()
feature_summary.style.background_gradient(cmap='Blues')

In [None]:
# Count missing values per column.
train.isnull().sum()

No column contains null values, so we can proceed without handling missing data.

# Data Visualization 

In [None]:
sns.set(rc={'figure.figsize':(25,25)})

# Draw ONE seeded sample and reuse it for every panel: the histograms are then
# reproducible and all describe the same rows. The sample size is capped so the
# cell also works on frames with fewer than 1000 rows.
hist_sample = train.sample(min(1000, len(train)), random_state=0)

# Size the grid from the number of columns instead of hard-coding 5x6
# (ceil division; 30 columns still gives the original 5 rows x 6 columns).
n_grid_cols = 6
n_grid_rows = -(-len(train.columns) // n_grid_cols)

for i, column in enumerate(train.columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(x=column, data=hist_sample, stat='count', kde=True, color='orange')

In [None]:
# Split the column labels by storage dtype: floating-point vs. integer.
column_dtypes = train.dtypes
float_cols = column_dtypes.index[column_dtypes == 'float']
int_cols = column_dtypes.index[column_dtypes == 'int']
float_cols, int_cols

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
sns.set(rc={'figure.figsize': (25, 21)})
all_correlations = train.corr()
sns.heatmap(all_correlations, annot=True, fmt='.2f')

When building a clustering model, it is best to avoid features that are correlated with each other (and are therefore redundant). In this section, our main aim is to analyse the relationships between the features. We start by computing their pairwise correlation coefficients and displaying them as a heatmap, which lets us determine which features are linearly related.

In [None]:
# Pairwise correlations of the feature columns (everything from 'f_00' onwards).
corr = train.loc[:, 'f_00':].corr()

# Boolean mask that hides the upper triangle (diagonal included).
# It is built from cell POSITIONS, not from the correlation values: masking
# with np.triu(corr) directly would leave any upper-triangle cell whose
# correlation is exactly 0 unmasked, because 0 is treated as "do not mask".
matrix = np.triu(np.ones_like(corr, dtype=bool))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(22, 8))

# Left: signed correlations.
sns.heatmap(corr, mask=matrix, center=0, cmap='vlag', ax=axes[0]).set_title('Without absolute values')
# Right: magnitudes only, to spot strong relationships regardless of sign.
sns.heatmap(abs(corr), mask=matrix, center=0, cmap='vlag', ax=axes[1]).set_title('With absolute values')

fig.tight_layout(h_pad=1.0, w_pad=0.5)

# Scaling the Data

In [None]:
# Standardise every feature (zero mean, unit variance); 'id' is not a feature.
features = train.drop('id', axis=1)
# Take the column labels from the frame that was actually scaled, rather than
# from train.columns[1:], which is only correct when 'id' is the first column.
scaled_data = pd.DataFrame(
    StandardScaler().fit_transform(features),
    columns=features.columns,
)

# Apply PCA

In [None]:
def apply_pca(X, transformer = False, components = -1):
    """Fit a PCA on ``X`` and return the model, the projected data and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features; the column labels are used to index the loadings.
    transformer : object or False, default False
        Optional pre-processing step exposing ``fit_transform`` (e.g. a
        scaler). When given, it is fitted on ``X`` and its output is what PCA
        sees. Any falsy value skips this step.
    components : int, float, str or None, default -1
        Forwarded to ``PCA(n_components=...)``. ``-1`` (the historical
        sentinel) or ``None`` keeps all components.

    Returns
    -------
    pca : PCA
        The fitted PCA estimator.
    X_pca : pandas.DataFrame
        Projected data with columns ``PC1``, ``PC2``, ...
    loadings : pandas.DataFrame
        ``pca.components_`` transposed: rows are the input features, columns
        are the principal components.
    """
    if transformer:
        # Only the labels are needed afterwards, so keep a reference to the
        # column index instead of deep-copying the whole frame.
        feature_names = X.columns
        X = pd.DataFrame(transformer.fit_transform(X))
        # Restore the original labels only when the transformer kept the
        # feature count; otherwise fall back to the default integer labels
        # instead of failing with a length-mismatch error.
        if X.shape[1] == len(feature_names):
            X.columns = feature_names

    # Create principal components (n_components=None means "keep all").
    keep_all = components is None or components == -1
    pca = PCA(n_components=None if keep_all else components)
    X_pca = pca.fit_transform(X)

    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)

    # Create loadings: transpose so the columns are the principal components
    # and the rows are the (possibly transformed) input features.
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=X.columns,
    )
    return pca, X_pca, loadings

In [None]:
def plot_variance(pca, width=8, dpi=100):
    """Plot the explained-variance ratios of a fitted PCA, per component and cumulative.

    Parameters
    ----------
    pca : object
        Fitted estimator exposing ``n_components_`` and
        ``explained_variance_ratio_``.
    width, dpi : numbers
        Accepted for interface compatibility but not used: the figure size is
        fixed at (22, 5).

    Returns
    -------
    The array of two Axes: bar chart on the left, cumulative curve on the right.
    """
    fig, axs = plt.subplots(1, 2, figsize=(22, 5))
    bar_ax, line_ax = axs

    component_ids = np.arange(1, pca.n_components_ + 1)
    variance_ratio = pca.explained_variance_ratio_

    # Left panel: share of variance explained by each component.
    bar_ax.bar(component_ids, variance_ratio)
    bar_ax.set_xlabel("Component")
    bar_ax.set_title("% Explained Variance")

    # Right panel: running total, anchored at the origin (0 components -> 0).
    running_total = np.cumsum(variance_ratio)
    xs = np.concatenate(([0], component_ids))
    ys = np.concatenate(([0], running_total))
    line_ax.plot(xs, ys, "o-")
    line_ax.set_xlabel("Component")
    line_ax.set_title("% Cumulative Variance")

    fig.tight_layout(h_pad=1.0, w_pad=0.5)
    return axs

In [None]:
# Project the scaled data onto its first three principal components using the
# apply_pca helper, keep the projection as a plain array in `p` (used for the
# scatter plot further down), and show how much variance each component explains.
pca, projected, _ = apply_pca(scaled_data, components=3)
p = projected.to_numpy()
plot_variance(pca)

# Clustering

In [None]:
# Elbow analysis for KMeans over the candidate cluster counts given by k=(4, 12).
# The estimator is seeded so that the curve is reproducible between runs,
# in line with the seeded mixture models used later in this notebook.
model = KMeans(random_state=0)
visualizer = KElbowVisualizer(model, k=(4,12))

visualizer.fit(scaled_data)
visualizer.show()

# Bayesian Gaussian Mixture (BGM)

In [None]:
# Fit a Bayesian Gaussian mixture with up to 7 full-covariance components and
# assign every row to a component.
bgm_params = dict(n_components=7, covariance_type='full', random_state=1)
gmm = BayesianGaussianMixture(**bgm_params)
pred = gmm.fit_predict(scaled_data)

In [None]:
# Colours of the active matplotlib colour cycle; clusters wrap around the
# palette if there are more clusters than colours.
prop_cycle = plt.rcParams['axes.prop_cycle']
palette = prop_cycle.by_key()['color']

plt.figure(figsize=(8, 8))
# Draw one scatter series per cluster so that each cluster gets its own colour
# AND its own legend entry. (A single scatter call can carry only one label;
# the previous label used a stale loop index from an earlier cell.)
for cluster in np.unique(pred):
    members = pred == cluster
    plt.scatter(
        p[members, 0],
        p[members, 1],
        s=1,
        color=palette[cluster % len(palette)],
        label=f"Cluster {cluster}",
    )
plt.xlabel('PCA[0]')
plt.ylabel('PCA[1]')
# Points are drawn with s=1, so enlarge the legend markers to keep them readable.
plt.legend(markerscale=10)
plt.title('PCA projection')
plt.show()

In [None]:
# Fit one full-covariance Gaussian mixture per candidate component count
# (1 to 20) and compare them with the BIC and AIC information criteria.
n_components = np.arange(1, 21)

models = []
for n in n_components:
    mixture = GaussianMixture(n, covariance_type='full', random_state=0)
    models.append(mixture.fit(scaled_data))

bic_scores = [fitted.bic(scaled_data) for fitted in models]
aic_scores = [fitted.aic(scaled_data) for fitted in models]

plt.plot(n_components, bic_scores, label='BIC')
plt.plot(n_components, aic_scores, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components')

# Submission

In [None]:
# Fill the submission template with the mixture-model cluster labels and
# write it out without the index column.
SUBMISSION_TEMPLATE = '../input/tabular-playground-series-jul-2022/sample_submission.csv'
sample_submission = pd.read_csv(SUBMISSION_TEMPLATE)
sample_submission = sample_submission.assign(Predicted=pred)
sample_submission.to_csv('submission.csv', index=False)