## Execute Clustering Analysis

In [26]:
import pandas as pd
from pandas import DataFrame

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
scalar = preprocessing.MinMaxScaler()

#### Define functions

In [30]:
def run_kmeans(_x: DataFrame, n_clusters: int) -> None:
    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
    kmeans.fit(_x)
    print(kmeans.cluster_centers_)
    print(kmeans.labels_)
    print(kmeans.inertia_)

#### Read training dataset from pickle file

In [8]:
master: DataFrame = pd.read_pickle('./data/master.pickle')
print(master.shape)

#### Select features for the classification analysis

In [9]:
features_0 = ['Weekly_Sales', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 
              'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Store', 'Dept', 'Year', 'Month']
categorical = ['IsHoliday', 'Type']

In [21]:
master_1: DataFrame = master[features_0]
print(master_1.shape)
print(master_1.info())
print(master_1.shape)

In [18]:
master_2: DataFrame = master_1.apply(le.fit_transform)
print(master_2.shape)
print(master_2.head())

In [20]:
master_3 = scalar.fit_transform(master_2)

In [22]:
master_4 = pd.DataFrame(master_3, columns=features_0)

In [23]:
print(master_4.shape)
print(master_4.head())

#### Running classification models

In [31]:
run_kmeans(master_4, 2)

[[4.71775927e-01 1.10673397e-01 3.31883749e-01 6.76905382e-01
  5.80834083e-01 4.79241226e-01 4.54969027e-01 5.06261807e-01
  5.75918558e-01 4.17572009e-01 4.22434938e-01 2.42298104e-01
  5.52891126e-01 4.58303722e-01 4.85710521e-01 8.52571402e-01
  3.81807270e-01]
 [4.46338872e-01 6.97325186e-02 5.48147381e-01 4.37522347e-01
  3.47593667e-03 6.93361789e-03 1.23451576e-02 7.43228958e-04
  4.97970123e-03 2.96823657e-01 5.64363375e-01 3.06656107e-01
  4.86671678e-01 4.90875546e-01 4.88718969e-01 2.56926592e-01
  4.96428912e-01]]
[1 1 1 ... 0 0 0]
377819.35628877743


In [32]:
run_kmeans(master_4, 4)

[[3.75211142e-01 7.24683523e-02 5.47597378e-01 4.59073109e-01
  9.72843621e-03 1.47490441e-02 2.48109649e-02 2.59629489e-03
  1.36864427e-02 2.82744499e-01 5.92120956e-01 5.81358032e-01
  2.31657168e-01 5.16374378e-01 4.89971104e-01 2.85375851e-01
  4.92323148e-01]
 [4.79605624e-01 1.06689838e-01 3.26813129e-01 6.80200346e-01
  6.02982954e-01 4.92085612e-01 4.55933174e-01 5.30693471e-01
  5.94374725e-01 4.10661972e-01 4.22074959e-01 2.15799646e-01
  5.72758820e-01 4.47384468e-01 4.84715207e-01 8.51123094e-01
  3.85624283e-01]
 [5.25505480e-01 6.95029197e-02 5.45540224e-01 4.17299834e-01
  8.72030046e-04 4.15594546e-03 8.99249644e-03 2.85297408e-04
  1.10454834e-03 3.19466504e-01 5.28775029e-01 3.42603946e-05
  7.73704728e-01 4.65688215e-01 4.87660738e-01 2.40214851e-01
  4.96404735e-01]]
[2 2 2 ... 1 1 1]
331380.3504466621


#### Running finalized model on test set

In [43]:
test: DataFrame = pd.read_pickle('./data/test.pickle')
print(test.shape)