**Author:** [Riccardo Guidotti](http://kdd.isti.cnr.it/people/riccardo-guidotti)  
**Python version:**  3.x

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Data Preparation

In [2]:
class_name = 'Occupancy'
df = pd.read_csv(r'/Users/Cava/Desktop/University/Data Science & Business Informatics/Data Mining/Advanced Topics and Applications/Project/data/training.csv')
columns2remove = ['Unnamed: 0']
df.drop(columns2remove, inplace=True, axis=1)
df['date'] =  pd.to_datetime(df['date'], format = '%Y-%m-%d %H:%M:%S')
df.sort_values('date', inplace = True)
df.reset_index(inplace = True, drop = True)
df.set_index('date', inplace=True)
df

Unnamed: 0_level_0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-02 14:19:00,23.7000,26.2720,585.200000,749.200000,0.004764,1
2015-02-02 14:19:59,23.7180,26.2900,578.400000,760.400000,0.004773,1
2015-02-02 14:21:00,23.7300,26.2300,572.666667,769.666667,0.004765,1
2015-02-02 14:22:00,23.7225,26.1250,493.750000,774.750000,0.004744,1
2015-02-02 14:23:00,23.7540,26.2000,488.600000,779.000000,0.004767,1
...,...,...,...,...,...,...
2015-02-18 09:14:00,20.8150,27.4175,430.750000,1511.750000,0.004167,1
2015-02-18 09:16:00,20.8650,27.7450,423.500000,1514.500000,0.004230,1
2015-02-18 09:16:59,20.8900,27.7450,423.500000,1521.500000,0.004237,1
2015-02-18 09:17:59,20.8900,28.0225,418.750000,1632.000000,0.004279,1


In [3]:
class_name = 'Occupancy'
test = pd.read_csv(r'/Users/Cava/Desktop/University/Data Science & Business Informatics/Data Mining/Advanced Topics and Applications/Project/data/test.csv')
columns2remove = ['Unnamed: 0']
test.drop(columns2remove, inplace=True, axis=1)
test['date'] =  pd.to_datetime(test['date'], format = '%Y-%m-%d %H:%M:%S')
test.sort_values('date', inplace = True)
test.reset_index(inplace = True, drop = True)
test.set_index('date', inplace=True)
test

Unnamed: 0_level_0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-02 14:19:00,23.700,26.2720,585.200000,749.200000,0.004764,1
2015-02-02 14:19:59,23.718,26.2900,578.400000,760.400000,0.004773,1
2015-02-02 14:21:00,23.730,26.2300,572.666667,769.666667,0.004765,1
2015-02-02 14:23:00,23.754,26.2000,488.600000,779.000000,0.004767,1
2015-02-02 14:25:00,23.730,26.2900,536.333333,798.000000,0.004776,1
...,...,...,...,...,...,...
2015-02-18 09:11:59,20.790,27.0300,419.000000,1488.000000,0.004101,1
2015-02-18 09:15:00,20.815,27.7175,429.750000,1505.250000,0.004213,1
2015-02-18 09:16:00,20.865,27.7450,423.500000,1514.500000,0.004230,1
2015-02-18 09:16:59,20.890,27.7450,423.500000,1521.500000,0.004237,1


In [4]:
attributes = [c for c in df.columns if (c != class_name) & (c != 'HumidityRatio')]

In [5]:
X_train = df[attributes].values
y_train = df[class_name]

X_train.shape 

(14391, 4)

In [6]:
X_test = test[attributes].values
y_test = test[class_name]

X_test.shape

(6168, 4)

# Gaussian Mixture

In [7]:
from sklearn.mixture import GaussianMixture

covariance_type

- 'full': each component has its own general covariance matrix
- 'tied': all components share the same general covariance matrix
- 'diag': each component has its own diagonal covariance matrix
- 'spherical': each component has its own single variance

In [8]:
gm = GaussianMixture(n_components=1, random_state=0, covariance_type='full')
gm.fit(X_train)

y_pred = gm.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.7696173800259404
F1-score [0.86981219 0.        ]


In [9]:
gm = GaussianMixture(n_components=2, random_state=0, covariance_type='full')
gm.fit(X_train)

y_pred = gm.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.8524643320363164
F1-score [0.89398882 0.75746269]


In [10]:
gm = GaussianMixture(n_components=2, random_state=0, covariance_type='tied')
gm.fit(X_train)

y_pred = gm.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.9787613488975356
F1-score [0.98601772 0.95584766]


In [11]:
gm.means_

array([[ 20.54716596,  27.62388939,  15.06346843, 606.92424839],
       [ 21.98427193,  27.7732006 , 478.23017396, 951.47880201]])

In [12]:
gm.covariances_

array([[ 7.27842076e-01, -8.44605901e-01,  2.96489455e+01,
         5.51265538e+01],
       [-8.44605901e-01,  2.46413809e+01, -4.34404981e+01,
         4.65849128e+02],
       [ 2.96489455e+01, -4.34404981e+01,  4.46961105e+03,
        -6.57766048e+02],
       [ 5.51265538e+01,  4.65849128e+02, -6.57766048e+02,
         7.61462220e+04]])

In [13]:
gm.precisions_

array([[ 2.24459552e+00,  9.58677720e-02, -1.43013002e-02,
        -2.33502909e-03],
       [ 9.58677720e-02,  5.07290590e-02, -1.99033859e-04,
        -3.81474809e-04],
       [-1.43013002e-02, -1.99033859e-04,  3.18773758e-04,
         1.43248039e-05],
       [-2.33502909e-03, -3.81474809e-04,  1.43248039e-05,
         1.72806234e-05]])

# X-Means

In [14]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train)
X_test = scaler.fit_transform(X_test)
X_test = pd.DataFrame(X_test)

https://github.com/annoviko/pyclustering/

In [15]:
from pyclustering.cluster import xmeans

In [17]:
xm = xmeans.xmeans(X_train)
xm.process()

KeyError: 4

In [None]:
clusters = xm.get_clusters()

In [None]:
centers = xm.get_centers()

In [None]:
i = df.columns.values.tolist().index('Temperature')
j = df.columns.values.tolist().index('CO2')

In [None]:
for indexes in clusters:
    plt.scatter(X_train[indexes,i], X_train[indexes,j], alpha=0.4)
for c in centers:
    plt.scatter(c[i], c[j], s=100, edgecolors='k')

In [None]:
i = df.columns.values.tolist().index('Temperature')
j = df.columns.values.tolist().index('Humidity')

In [None]:
for indexes in clusters:
    plt.scatter(X_train[indexes,i], X_train[indexes,j], alpha=0.4)
for c in centers:
    plt.scatter(c[i], c[j], s=100, edgecolors='k')

In [None]:
i = df.columns.values.tolist().index('CO2')
j = df.columns.values.tolist().index('Humidity')

In [None]:
for indexes in clusters:
    plt.scatter(X_train[indexes,i], X_train[indexes,j], alpha=0.4)
for c in centers:
    plt.scatter(c[i], c[j], s=100, edgecolors='k')

In [None]:
i = df.columns.values.tolist().index('CO2')
j = df.columns.values.tolist().index('Light')

In [None]:
for indexes in clusters:
    plt.scatter(X_train[indexes,i], X_train[indexes,j], alpha=0.4)
for c in centers:
    plt.scatter(c[i], c[j], s=100, edgecolors='k')

# K-Mode

https://github.com/nicodv/kmodes

In [18]:
from kmodes.kmodes import KModes

In [19]:
km = KModes(n_clusters=10, init='Huang', n_init=5, verbose=1)

clusters = km.fit_predict(X_train)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 632, cost: 44099.0
Run 1, iteration: 2/100, moves: 192, cost: 44099.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 442, cost: 44166.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 558, cost: 43929.0
Run 3, iteration: 2/100, moves: 54, cost: 43929.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 748, cost: 43844.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 970, cost: 43306.0
Run 5, iteration: 2/100, moves: 168, cost: 43306.0
Best run was number 5


In [20]:
centroids = km.cluster_centroids_

In [21]:
new_clusters = km.labels_

In [22]:
i = df.columns.values.tolist().index('CO2')
j = df.columns.values.tolist().index('Light')

In [23]:
for indexes in new_clusters:
    plt.scatter(X_train[indexes,i], X_train[indexes,j], alpha=0.4)
for c in centroids:
    plt.scatter(c[i], c[j], s=100, edgecolors='k')

KeyError: (0, 3)

# Rock

https://github.com/annoviko/pyclustering/

In [None]:
from pyclustering.cluster import rock

In [None]:
rc = rock.rock(X_test, eps=0.7, number_clusters=4, threshold=0.5)
rc.process()