In [1]:
pip install scikit-learn-extra

Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 22.1 MB/s eta 0:00:01[K     |▍                               | 20 kB 6.3 MB/s eta 0:00:01[K     |▋                               | 30 kB 8.6 MB/s eta 0:00:01[K     |▊                               | 40 kB 7.7 MB/s eta 0:00:01[K     |█                               | 51 kB 4.1 MB/s eta 0:00:01[K     |█▏                              | 61 kB 4.3 MB/s eta 0:00:01[K     |█▍                              | 71 kB 4.6 MB/s eta 0:00:01[K     |█▌                              | 81 kB 5.2 MB/s eta 0:00:01[K     |█▊                              | 92 kB 3.9 MB/s eta 0:00:01[K     |██                              | 102 kB 4.2 MB/s eta 0:00:01[K     |██                              | 112 kB 4.2 MB/s eta 0:00:01[K     |██▎                             | 122 kB 4.2 MB/s eta 0:00:01[K     |██▌                       

#Importing Modules

In [2]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, Birch, MiniBatchKMeans
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler

from scipy.stats import multivariate_normal

#Importing Dataset

In [3]:
df = pd.read_csv('covtype_train.zip')
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness,target
0,elevation_medium,aspect_medium,slope_low,hillshade_9am_max,hillnoon_max,0,1,low,22,0,2
1,elevation_high,aspect_medium,slope_low,hillshade_9am_max,hillnoon_max,1,1,mid,32,2,1
2,elevation_medium,aspect_low,slope_low,hillshade_9am_max,hillnoon_max,1,1,low,10,2,2
3,elevation_high,aspect_ultra,slope_medium,hillshade_9am_max,hillnoon_max,2,1,low,23,2,1
4,elevation_high,aspect_high,slope_low,hillshade_9am_max,hillnoon_max,2,1,mid,28,0,2


#Encoding Categorical Values

In [4]:
df = pd.get_dummies(df)

In [5]:
df.head()

Unnamed: 0,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Soil_Type,Wilderness,target,Elevation_elevation_high,Elevation_elevation_low,Elevation_elevation_medium,Elevation_elevation_ultra,Aspect_aspect_high,Aspect_aspect_low,Aspect_aspect_medium,Aspect_aspect_ultra,Slope_slope_high,Slope_slope_low,Slope_slope_medium,Slope_slope_ultra,Hillshade_9am_hillshade_9am_max,Hillshade_9am_hillshade_9am_min,Hillshade_Noon_hillnoon_max,Hillshade_Noon_hillnoon_min,Horizontal_Distance_To_Fire_Points_high,Horizontal_Distance_To_Fire_Points_low,Horizontal_Distance_To_Fire_Points_mid
0,0,1,22,0,2,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0
1,1,1,32,2,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
2,1,1,10,2,2,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
3,2,1,23,2,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0
4,2,1,28,0,2,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,1


#Spliting into X and y values

In [6]:
y = df['target']
X = df.drop(columns=['target'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

#Feature Scaling

In [8]:
standardScaler = StandardScaler()

X_train = standardScaler.fit_transform(X_train)
X_test = standardScaler.transform(X_test)
X = standardScaler.fit_transform(X)

In [9]:
with open('standardScaler','wb+') as file:
    pickle.dump(standardScaler,file)

#Grid Search

In [10]:
def tune_hyperparameters(model,X,y):
  param_grid = {
      'n_init' : np.arange(5,16),
      'max_iter' : np.arange(100,401,50),
      'algorithm' : ['auto', 'full', 'elkan']
  }
  grid_search = GridSearchCV(model,param_grid=param_grid)
  grid_search.fit(X,y)
  print("Best Params: ",grid_search.best_params_)
  return grid_search.best_params_

In [11]:
# best_params = tune_hyperparameters(KMeans(random_state=0,n_clusters=n_clusters),X,y)

In [12]:
best_params = {'algorithm': 'auto', 'max_iter': 100, 'n_init': 8}

In [13]:
n_clusters = 7

#K-Means

In [14]:
kmeans = KMeans(n_clusters=n_clusters,random_state=0,**best_params).fit(X)

#Mapping clusters

In [15]:
def find_mapping(labels, X, y):
  # labels - contain values from 0 to 6
  # y contain values from 1 to 7

  pref = {}
  for clus in range(7):
    temp = {}
    for j in range(1,8):
      cnt = 0
      for k in range(len(y)):
        cnt += (y[k] == j and labels[k] == clus)
      temp[j] = cnt
    temp = dict(sorted(temp.items(), key=lambda item: -item[1]))
    order = []
    for i in temp: order.append(i)
  
    pref[clus] = order

  perm = []
  vis = [0] * (10)

  for i in range(7):
    for j in pref[i]:
      if(vis[j]): continue
      else:
        perm.append(j);
        vis[j] = 1
        break
  
  return perm;

#Prediction

### Full Dataset

In [16]:
order = (find_mapping(list(kmeans.predict(X)),X,list(y)))
print(order)

[1, 2, 7, 5, 3, 6, 4]


In [17]:
label = kmeans.predict(X)
for i in range(len(label)):
  label[i] = order[label[i]]

print(f1_score(y,label,average = 'weighted'))

0.4207065081867018


### Training Dataset

In [18]:
order = (find_mapping(list(kmeans.predict(X_train)),X_train,list(y_train)))
print(order)

[1, 2, 6, 7, 3, 5, 4]


In [19]:
order = [1, 2, 7, 5, 3, 6, 4]

In [20]:
label = kmeans.predict(X_train)
for i in range(len(label)):
  label[i] = order[label[i]]

print(f1_score(y_train,label,average = 'weighted'))

0.42083496813276416


### Test Dataset

In [21]:
order = (find_mapping(list(kmeans.predict(X_test)),X_test,list(y_test)))
print(order)

[1, 2, 5, 7, 3, 6, 4]


In [22]:
order = [1, 2, 7, 5, 3, 6, 4]

In [23]:
label = kmeans.predict(X_test)
for i in range(len(label)):
  label[i] = order[label[i]]

print(f1_score(y_test,label,average = 'weighted'))

0.4203184202811393


In [24]:
with open('model','wb+') as file:
    pickle.dump(kmeans,file)