# [TLDR] exploratory work with KMeans

# [LONGER VERSION]
- much like `AgglomerativeClustering`, `KMeans` seems better suited to identifying and encoding clusters than to my problem at hand

# [RESULT] PASS

# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_DataSet.ipynb

# GET DATASET

In [None]:
# Instantiate the project's Dataset wrapper (presumably provided by the
# class_DataSet.ipynb notebook run above — confirm); the cells below read
# its `raw` and `df` attributes.
d=Dataset()

In [None]:
d.raw.head(2) # preview the first 2 rows of the raw data

In [None]:
d.df.head(2) # preview the first 2 rows of the numeric data

# PREP DATA FOR K-MEANS

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# SCALE NUMERICAL DATA

### get

In [None]:
# Keep only the int64/float64 columns of the numeric frame; these are the
# features that get imputed, scaled and clustered in the cells below.
numeric_dtypes = ['int64', 'float64']
df_num = d.df.select_dtypes(include=numeric_dtypes)
df_num  # display

In [None]:
# summary statistics per numeric column (this cell sits before the imputation cells)
df_num.describe()

### impute nans

In [None]:
df_num.isna().sum() # count of missing values per column

In [None]:
# Show the numeric raw columns for the rows whose `rating_five_count` is
# missing.
# NOTE(review): only this one column is used as the NaN indicator — confirm
# that no other column has NaNs in additional rows.
missing_five_star = d.raw['rating_five_count'].isna()
rows_with_nans    = df_num[missing_five_star].index
d.raw.select_dtypes(include=['int64', 'float64']).loc[rows_with_nans]

##### manually impute

In [None]:
# per-column means; these are the fill values for the NaNs
mean_values = df_num.mean(axis='index')
mean_values  # display

In [None]:
# Replace each NaN with the mean of its own column (fillna matches the
# Series index of `mean_values` against the column names).
# NOTE(review): `df_num` is overwritten here, so when cells run in order the
# sklearn imputers further down receive already-filled data.
df_num = df_num.fillna(mean_values)

##### sklearn impute

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# same mean imputation as the manual version, done with sklearn this time
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed = imp.fit_transform(df_num)
imputed  # display the imputed array

In [None]:
# wrap the imputed array back into a frame carrying df_num's row/column labels
df_num_imputed = pd.DataFrame(
    data    = imputed,
    index   = df_num.index,
    columns = df_num.columns,
)
df_num_imputed.isna().sum()  # every count should now be 0

### scale

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# standardize every imputed feature; `scaler` stays available afterwards
scaler = StandardScaler()
scaled_features = scaler.fit(df_num_imputed).transform(df_num_imputed)

In [None]:
# labelled frame version of the scaled array (same index/columns as df_num)
df_num_imputed_scaled = pd.DataFrame(
    data    = scaled_features,
    index   = df_num.index,
    columns = df_num.columns,
)

In [None]:
# sanity check on the scaling: after StandardScaler each column should show
# a mean close to 0 and a std close to 1
df_num_imputed_scaled.describe()

### check no nans 

In [None]:
df_num_imputed_scaled.isna().sum() # per-column NaN count; all should be 0 before clustering

# APPLY K-MEANS
- partitional clustering (k-means falls in this family)
- hierarchical clustering / agglomerative vs divisive
- density-based clustering

### figure out best `k` to use

In [None]:
# Elbow search: fit one KMeans model per k in 1..49 on the scaled features
# and record its SSE (inertia). `sse` is a dict mapping k -> inertia.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    # fit() returns the estimator itself, so `kmeans` is the fitted model
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    sse[k] = kmeans.inertia_

In [None]:
pd.Series(sse).plot() # elbow plot: see how the SSE (inertia) decays with increasing `k`; looks like ~10 groups is pretty good

### do fit for single `k`

In [None]:
# Model for a single, fixed k (not fitted yet).
# NOTE(review): the elbow-plot note suggests ~10 groups, but 15 clusters are
# configured here — confirm which value is intended.
kmeans_kwargs = dict(
    n_clusters=15,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans = KMeans(**kmeans_kwargs)

In [None]:
kmeans.fit(df_num_imputed_scaled) # fit on the scaled features because the k-means distance metric is sensitive to feature scale

In [None]:
kmeans.labels_ # cluster index assigned to each fitted row

In [None]:
# distinct cluster labels that were actually assigned
pd.Series(kmeans.labels_).unique()

In [None]:
# (rows, features) of the frame that was clustered
df_num_imputed_scaled.shape

In [None]:
# number of distinct clusters actually used by the fitted model
pd.Series(kmeans.labels_).nunique()

# REPEAT WITH PIPELINE

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

### [figure out best `k` to use] Impute

In [None]:
# Baseline pipeline: mean imputation only (no scaling, no PCA).
pl = make_pipeline(
    SimpleImputer(strategy='mean', missing_values=np.nan),  # fill NaNs with the column mean
)

# name kept as `reduced` for parity with the other pipeline cells
reduced = pl.fit_transform(df_num)

# Elbow search: `sse` is a dict mapping k -> SSE (inertia) of the fitted model.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced)
    sse[k] = kmeans.inertia_

pd.Series(sse).plot()

### [figure out best `k` to use] Impute + StandardScaler

In [None]:
# Pipeline: mean imputation followed by standardization (no PCA here).
pl = make_pipeline(
    SimpleImputer(strategy='mean', missing_values=np.nan),  # fill NaNs with the column mean
    StandardScaler(),  # k-means distances are sensitive to feature scale
)

# name kept as `reduced` for parity with the other pipeline cells
reduced = pl.fit_transform(df_num)

# Elbow search: `sse` is a dict mapping k -> SSE (inertia) of the fitted model.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced)
    sse[k] = kmeans.inertia_

pd.Series(sse).plot()

### [figure out best `k` to use] Impute + StandardScaler + PCA

In [None]:
# Pipeline: mean imputation -> standardization -> PCA down to 5 components.
pl = make_pipeline(
    SimpleImputer(strategy='mean', missing_values=np.nan),  # PCA can't have nans
    StandardScaler(),  # scale before PCA, which is very sensitive to feature scale
    PCA(n_components=5),
)

reduced = pl.fit_transform(df_num)

# Elbow search: `sse` is a dict mapping k -> SSE (inertia) of the fitted model.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced)
    sse[k] = kmeans.inertia_

pd.Series(sse).plot()

### [figure out best `k` to use] Impute + StandardScaler + MinMaxScaler

In [None]:
# Pipeline: mean imputation -> standardization -> min-max rescale (no PCA here).
pl = make_pipeline(
    SimpleImputer(strategy='mean', missing_values=np.nan),  # fill NaNs with the column mean
    StandardScaler(),  # k-means distances are sensitive to feature scale
    # squeeze each feature into [-1, 1]; original idea (TODO): consider
    # applying min-max to ~1 sd of the data to smooth outliers
    MinMaxScaler(feature_range=(-1, 1)),
)

# name kept as `reduced` for parity with the other pipeline cells
reduced = pl.fit_transform(df_num)

# Elbow search: `sse` is a dict mapping k -> SSE (inertia) of the fitted model.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced)
    sse[k] = kmeans.inertia_

pd.Series(sse).plot()

### [figure out best `k` to use] Impute + StandardScaler + MinMaxScaler + PCA

In [None]:
# Pipeline: mean imputation -> standardization -> min-max rescale -> PCA (5 components).
pl = make_pipeline(
    SimpleImputer(strategy='mean', missing_values=np.nan),  # PCA can't have nans
    StandardScaler(),  # scale before PCA, which is very sensitive to feature scale
    # squeeze each feature into [-1, 1]; original idea (TODO): consider
    # applying min-max to ~1 sd of the data to smooth outliers
    MinMaxScaler(feature_range=(-1, 1)),
    PCA(n_components=5),
)

reduced = pl.fit_transform(df_num)

# Elbow search: `sse` is a dict mapping k -> SSE (inertia) of the fitted model.
sse     = {}
k_range = range(1, 50)
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced)
    sse[k] = kmeans.inertia_

pd.Series(sse).plot()