### Stage 2: App behaviour is normally distributed

    1) The activity vectors are sampled from a Gaussian distribution
    2) The clusters with normal distributions represent samples generated from a hidden Markov process
    3) To test this, we must first observe whether the data can be split into clusters with Gaussian distributions at a significance level of 0.001
    4) This test will be done via the G-Means algorithm [1]
    
[1] Hamerly, G., & Elkan, C. (2003). Learning the k in k-means. Advances in Neural Information Processing Systems 16.


In [1]:
import plotly 

In [18]:
import numpy as np
import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import scale

from utils import *
from gmm_means_mahalanobis2 import *
import pandas as pd

import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.cluster import KMeans
import pickle
# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. A KeyError here means PLOTLY_USERNAME / PLOTLY_API_KEY have
# not been exported in the shell that launched the kernel.
# NOTE(review): the API key that used to be hardcoded on this line has been
# exposed and should be revoked and regenerated.
import os

plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

%load_ext autoreload
%autoreload 2


In [3]:
# time_percentage = 0.95
# explained_variance = 0.9
# df = pd.read_csv("data/rescuetime_data-ac-min.csv")
# data_pd = Clean_DF(df)

# data_pd.clean_data(time_percentage=time_percentage)
# data_pd.clean_df = data_pd.clean_df.reset_index()
# data_pd.get_pca(explained_variance=explained_variance)
# data_pd.get_day_time()

# Load the cached, already-cleaned data object instead of rebuilding it
# (the commented-out recipe above looks like how it was produced — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('data_pd.pickle', 'rb') as pickle_file:
    data_pd = pickle.load(pickle_file)

# Keep the raw activity vectors alongside a scaled copy for later comparison.
unscaled_data = data_pd.activity_vector
scaled_data = scale(unscaled_data)

In [4]:
data_pd.clean_df.columns.values

array(['Date', 'Time Spent (seconds)', 'Activity', 'Category',
       'Productivity', 'Activity Vector', 'Productivity Score', 'Day',
       'Time'], dtype=object)

## Visualize Gaussian Mixture Model with unscaled data

In [10]:
# from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components=15, kernel='rbf', degree=5)
# kpca.fit(data_pd.activity_vector)
# pca_kernel = kpca.transform(data_pd.activity_vector)

# Fit a 4-component Gaussian mixture on data_pd.pca_data and label every sample
# with its predicted component.
# random_state pins the otherwise random initialisation, so re-running the
# notebook reproduces the same component numbering and counts.
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(data_pd.pca_data)

gmm_labels = gmm.predict(data_pd.pca_data)

In [11]:
# Interactive 3-D scatter of the GMM labels, drawn on the PCA columns with
# indices 2, 3 and 4 and coloured by component.
c = gmm_labels
x, y, z = (data_pd.pca_data[:, axis] for axis in (2, 3, 4))

# Hover text per point: "<label>---<activity entries joined by '-'>".
t = [
    str(gmm_labels[row]) + '---' + '-'.join(parts)
    for row, parts in enumerate(data_pd.clean_df['Activity'].tolist())
]

trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM PCA unscaled data')

## Visualize Gaussian Mixture Model with scaled PCA data, cov = 'diag'

In [None]:
from sklearn import preprocessing

# Scale the PCA data, then fit a 4-component Gaussian mixture on it.
# covariance_type='diag' matches the "cov = 'diag'" stated in this section's
# heading; without it GaussianMixture silently uses its default ('full').
# random_state pins the initialisation so the labels are reproducible.
X = preprocessing.scale(data_pd.pca_data)
gmm_scaled = GaussianMixture(n_components=4, covariance_type='diag', random_state=0)
gmm_scaled.fit(X)
gmm_scaled_labels = gmm_scaled.predict(X)

In [None]:
# Interactive 3-D scatter of the mixture fitted on the scaled PCA data.
# Points are positioned on the PCA columns with indices 2, 3 and 4 and coloured
# by the scaled-model label.
c = gmm_scaled_labels
x = data_pd.pca_data[:, 2]
y = data_pd.pca_data[:, 3]
z = data_pd.pca_data[:, 4]

# Hover text per point: "<label>---<activity entries joined by '-'>".
# The label comes from `c` (the scaled model) so that the hover text agrees
# with the marker colour; it previously used gmm_labels from the unscaled fit.
activities = data_pd.clean_df['Activity'].tolist()
t = [str(c[i]) + '---' + '-'.join(parts) for i, parts in enumerate(activities)]

trace1 = go.Scatter3d(
    x=x, y=y, z=z, text=t, mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM PCA Scaled data')

In [None]:
# Plot the GMM component assigned to each sample against the sample's Date.
label_trace = go.Scatter(x=data_pd.clean_df['Date'], y=gmm_labels)
data = [label_trace]

py.iplot(data)

In [None]:
# Number of samples assigned to each GMM component, shown as {label: count}.
unique, counts = np.unique(gmm_labels, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [6]:
# Load the cached object stored in tsne_results_pca.pickle
# (presumably a t-SNE embedding of the PCA data — confirm against its producer).
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('tsne_results_pca.pickle', 'rb') as pickle_file:
    tsne_results_pca = pickle.load(pickle_file)

### Here GMM-Means finds the main clusters, and also distinguishes finer nuances within them

In [8]:
# Run GMM-Means on the unscaled activity vectors and report how many cluster
# centres it ends up with.
gmmm_means_unscaled = GMMMeans(
    critical_value=1.8692,
    verbose=1,
    recalculate_points=False,
)
gmmm_means_unscaled.fit(unscaled_data)

n_found = len(gmmm_means_unscaled.cluster_centers_)
print("GMM-Means with unscaled data finds: ", n_found, " clusters")

GMM-Means with unscaled data finds:  4  clusters


In [9]:
# Number of samples per GMM-Means label (unscaled fit), shown as {label: count}.
unique, counts = np.unique(gmmm_means_unscaled.labels_, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 11132, 1: 5572}

In [None]:
# Interactive 3-D scatter of the GMM-Means labels from the unscaled fit, drawn
# on the PCA columns with indices 2, 3 and 4.
c = gmmm_means_unscaled.labels_
pca = data_pd.pca_data
x, y, z = pca[:, 2], pca[:, 3], pca[:, 4]

# Hover text per point: "<label>---<activity entries joined by '-'>".
t = []
for idx, parts in enumerate(data_pd.clean_df['Activity'].tolist()):
    t.append(str(c[idx]) + '---' + '-'.join(parts))

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means with unscaled data')

In [None]:
# Run GMM-Means on the scaled activity vectors and report how many cluster
# centres it ends up with.
gmmm_means_scaled = GMMMeans(
    critical_value=0.3692,
    verbose=0,
    recalculate_points=False,
)
gmmm_means_scaled.fit(scaled_data)

n_found = len(gmmm_means_scaled.cluster_centers_)
print("GMM-Means with scaled data finds: ", n_found, " clusters")

In [None]:
# unique, counts = np.unique(gmmm_means_scaled.labels_, return_counts=True)
# dict(zip(unique, counts))
# Display the cluster centres stored on the GMM-Means model fitted on scaled
# data (the per-label counts above are left disabled).
gmmm_means_scaled.cluster_centers_

In [None]:
# Interactive 3-D scatter of the GMM-Means labels from the scaled fit, drawn on
# the PCA columns with indices 2, 3 and 4.
c = gmmm_means_scaled.labels_
x, y, z = (data_pd.pca_data[:, axis] for axis in (2, 3, 4))

# Hover text per point: "<label>---<activity entries joined by '-'>".
activity_names = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [str(c[row]) + '---' + name for row, name in enumerate(activity_names)]

trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means with scaled data')

In [None]:
# Run GMM-Means on the unscaled activity vectors with a lower critical value
# (1.1692) and report how many cluster centres it ends up with.
gmmpp_means_unscaled = GMMMeans(critical_value=1.1692, verbose=0, recalculate_points=False)
gmmpp_means_unscaled.fit(unscaled_data)
# The message previously said "scaled data" although the model is fitted on
# unscaled_data; corrected so the printed result is not misattributed.
print("GMM-Means with unscaled data finds: ", len(gmmpp_means_unscaled.cluster_centers_)," clusters")

init centers:  [[  7.93458951e-02  -1.13761242e-02   1.81765739e-02   1.00206848e-02
    1.52897192e-02   1.01886202e-02   8.31978912e-03   8.90696807e-03
    1.03357895e-02   5.61776545e-03   8.55249795e-03   1.03703580e-02
    1.04479184e-02   3.72186347e-03   4.91098259e-03   3.01690966e-03
    3.74778446e-03   2.99504259e-03   2.73701859e-03   2.49754590e-03
    4.84567235e-03   4.46934968e-03   2.79423651e-03   4.10104678e-03
    4.63470209e-03   2.60723657e-03   2.95625455e-03   2.68593819e-03
    2.41988422e-03   1.85366974e-03   2.88181429e-03   1.91024047e-03
    1.76507658e-03   3.97241315e-03   1.55453789e-03   1.35547010e-03
    1.18659798e-03   2.39533075e-03   1.41167250e-03   1.39267119e-03
    1.15943416e-03   3.00352943e-03   1.03282193e-03   1.18161390e-03
    9.31134794e-04   1.48189163e-03   3.29811884e-04   7.51483646e-04
    6.36062882e-04   1.15267720e-03   1.39769894e-03   4.86672876e-04
    1.23254661e-03   6.17521198e-04   8.71022131e-04   1.06221769e-03
    1

init centers:  [[  1.42498975e-01   2.87145702e-01   7.33700683e-02   9.58505300e-04
    5.56463499e-02   3.89986964e-02   2.73088345e-02   1.06004016e-02
    2.43883550e-02   1.75540107e-05   2.17623312e-02   1.69579693e-02
    9.73477104e-03   4.65922971e-05   4.01935508e-05   7.69731044e-05
    1.41084859e-02   1.12538097e-05   3.78978287e-05   7.39190353e-06
    8.57118440e-03   8.30117314e-03   1.35499188e-05   1.72222518e-03
    6.52046284e-03   6.61594181e-04   3.57046671e-05   8.76822447e-06
    2.22816503e-05   7.69983643e-06   9.15255481e-04   6.03176874e-06
    6.00491418e-06   4.38674223e-03   1.13498199e-05   1.29846424e-05
    1.08172518e-06   7.83239183e-04   1.48005009e-05   3.59629714e-06
    6.77636753e-06   1.71249352e-03   1.45796448e-07   1.06119842e-05
    7.90690009e-06   5.34907952e-04   8.92000842e-66   1.11170046e-06
    0.00000000e+00   6.30641762e-05   2.31003386e-05   2.04239548e-06
    5.75137489e-04   7.49872051e-06   1.89084612e-05   3.98556909e-06
    5

found sub centers [[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.     

found sub centers [[  1.18283566e-01   4.69040140e-02   1.15893632e-02   1.07276815e-04
    1.03600765e-03   1.04250135e-04   3.12795940e-05   1.69373078e-04
    9.60937079e-03   6.90520223e-11   3.55409988e-04   7.64381346e-03
    6.74046066e-02   0.00000000e+00   3.95881348e-05   3.95185813e-05
    5.66944905e-06   2.28929183e-18   1.35492235e-04   1.21779859e-09
    3.26228727e-04   2.95557630e-04   1.52863740e-07   1.15471477e-03
    1.03761242e-03   1.18555926e-04   4.04692588e-05   5.66139028e-06
    1.06932859e-10   2.82300395e-05   1.59497829e-04   2.61046284e-10
    0.00000000e+00   5.39287817e-04   0.00000000e+00   2.25820464e-05
    0.00000000e+00   8.46842018e-05   1.32697047e-10   0.00000000e+00
    7.11655677e-05   2.47640793e-04   0.00000000e+00   5.09358574e-11
    7.84538605e-20   8.28596824e-04   0.00000000e+00   6.21006277e-05
    0.00000000e+00   8.83476833e-12   5.15622969e-05   1.12910232e-05
    4.59654543e-10   8.58285764e-10   1.30814253e-14   5.64947433e-06
  

found sub centers [[  1.09172416e-001   1.67797861e-005   2.37996401e-002   4.85664109e-004
    1.31482098e-001   8.96607351e-010   2.74329066e-006   1.93786528e-003
    1.32869736e-002   1.37986556e-135   5.92235351e-002   1.60132312e-002
    1.48219144e-002   8.48024737e-043   5.59326485e-006   3.35595891e-006
    3.50256669e-002   1.45424886e-005   1.95669829e-024   0.00000000e+000
    2.68942156e-002   2.47455736e-002   1.61457274e-003   1.91330077e-003
    6.05255670e-003   1.97425610e-005   7.37506915e-006   1.11865297e-005
    8.94893821e-006   9.36111272e-007   5.92822126e-005   2.12544064e-005
    9.06616311e-029   1.77059296e-002   4.63895321e-048   1.96649885e-022
    2.23730594e-006   2.65155956e-005   4.47158808e-006   4.47453965e-006
    0.00000000e+000   3.42574127e-003   2.77067286e-070   5.16762188e-065
    7.83057079e-006   2.76010664e-005   0.00000000e+000   2.23730594e-006
    0.00000000e+000   4.47461188e-006   7.17052696e-004   0.00000000e+000
    6.82814869e-003 

init centers:  [[  3.18657984e-02   9.93143908e-03   1.54762412e-02   2.31910519e-02
    1.09193769e-02   2.41584216e-02   1.58797088e-02   2.72531674e-02
    7.53405844e-03   1.01436179e-02   4.56697744e-03   4.04850197e-03
    8.04227871e-03   8.28144859e-03   1.73458906e-02   8.72887249e-03
    1.76443262e-03   3.32249973e-03   7.96534932e-03   1.46703931e-02
    5.95679066e-04   3.88774542e-03   3.77996032e-03   1.05241191e-02
    2.98483937e-03   8.72033382e-03   8.15788658e-03   1.24080735e-03
    9.98333287e-03   5.50884753e-03   1.03484094e-02   7.86098045e-04
    1.91032464e-02   7.67089623e-04   3.15843559e-03   3.50984267e-03
    3.17677875e-03   1.78272700e-02   2.76210932e-03   1.59876806e-02
    3.26541584e-04   2.24254811e-02   1.14759462e-02   1.39016056e-02
    1.17910424e-04   4.74848377e-03   1.04308347e-03   8.45990445e-03
    7.06758949e-03   4.45860128e-03   1.06970661e-03   1.28229174e-03
    7.56613770e-04   6.91855191e-03   2.93287192e-03   8.65693078e-04
    1

found sub centers [[  1.02434110e-001   1.53784719e-002   2.43691244e-002   8.22065061e-006
    1.13485977e-001   4.24935104e-005   4.50836872e-008   1.95240452e-005
    4.93422922e-002   2.05516264e-006   1.96297279e-005   3.00418917e-002
    1.91744247e-002   6.16548795e-006   3.49377651e-005   1.23309759e-005
    1.13033946e-005   3.11167848e-036   8.22065061e-006   2.99392272e-027
    2.02640489e-002   3.81017980e-003   3.08274398e-006   9.57059745e-003
    2.64117017e-003   2.46619518e-005   5.74707161e-006   7.19306928e-006
    8.22065061e-006   8.22065061e-006   6.26244234e-005   3.14340786e-027
    0.00000000e+000   1.39218426e-002   0.00000000e+000   2.05516265e-006
    0.00000000e+000   3.21040080e-003   6.82939852e-006   1.02758133e-005
    2.05516265e-006   3.61242738e-003   0.00000000e+000   4.11032530e-006
    4.11032530e-006   8.83123202e-005   0.00000000e+000   2.44813529e-023
    0.00000000e+000   6.16548795e-006   2.77446958e-005   0.00000000e+000
    7.19306928e-006 

found sub centers [[  1.15766905e-001   1.73523789e-013   2.78341896e-002   1.48128775e-024
    2.39154978e-002   4.93010116e-288   0.00000000e+000   1.28850771e-024
    1.35914737e-002   0.00000000e+000   1.14860441e-001   4.35623579e-002
    1.58504093e-002   0.00000000e+000   4.42929118e-006   0.00000000e+000
    6.82324159e-002   1.13622469e-005   0.00000000e+000   0.00000000e+000
    4.95510047e-004   3.95921612e-002   4.46411275e-003   1.42611556e-003
    3.49150280e-002   4.53657167e-043   0.00000000e+000   8.85858235e-006
    0.00000000e+000   1.85325991e-006   1.26439083e-228   4.42929118e-006
    0.00000000e+000   1.91281622e-005   0.00000000e+000   0.00000000e+000
    0.00000000e+000   2.44150138e-036   0.00000000e+000   0.00000000e+000
    1.42196640e-043   6.05477351e-044   0.00000000e+000   3.49078134e-033
    2.60023815e-055   2.80676213e-005   0.00000000e+000   0.00000000e+000
    0.00000000e+000   0.00000000e+000   3.82535972e-007   0.00000000e+000
    1.28182435e-002 

found sub centers [[  2.02257294e-002   2.78678753e-005   0.00000000e+000   1.70566064e-001
    2.32232359e-006   1.88033634e-001   1.29452446e-001   6.14981347e-002
    2.98098791e-005   5.80580898e-005   2.32232359e-005   1.62562652e-005
    3.40169439e-004   6.16362507e-002   4.31712021e-004   4.44920154e-004
    0.00000000e+000   0.00000000e+000   1.26389407e-004   6.96697078e-006
    9.28929437e-006   2.32232359e-005   2.26200350e-006   2.32232359e-006
    4.70379380e-005   2.26152884e-002   1.93154132e-002   0.00000000e+000
    1.70358532e-004   4.87687751e-005   1.45026461e-002   0.00000000e+000
    7.19920314e-005   0.00000000e+000   2.70191548e-003   2.55455545e-005
    2.09009123e-005   2.71638735e-316   1.43273522e-003   6.96697078e-006
    0.00000000e+000   1.59203174e-004   6.96697078e-006   5.49169400e-005
    0.00000000e+000   4.33023095e-003   7.25676001e-132   7.19920314e-005
    0.00000000e+000   3.91098263e-004   0.00000000e+000   6.96697078e-006
    0.00000000e+000 

In [15]:
# Number of samples per label for gmmpp_means_unscaled, shown as {label: count}.
unique, counts = np.unique(gmmpp_means_unscaled.labels_, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 2, 1: 7493, 2: 2443, 3: 6766}

In [16]:
# Interactive 3-D scatter of the gmmpp_means_unscaled labels, drawn on the PCA
# columns with indices 2, 3 and 4.
c = gmmpp_means_unscaled.labels_
pca = data_pd.pca_data
x, y, z = pca[:, 2], pca[:, 3], pca[:, 4]

# Hover text per point: "<label>---<activity entries joined by '-'>".
t = [
    str(c[row]) + '---' + '-'.join(parts)
    for row, parts in enumerate(data_pd.clean_df['Activity'].tolist())
]

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means++ with unscaled data')

In [None]:
# Run GMM-Means on the scaled activity vectors with recalculate_points enabled,
# report the number of cluster centres, then show the per-label sample counts.
gmmpp_means_scaled = GMMMeans(
    critical_value=1.8692,
    verbose=0,
    recalculate_points=True,
)
gmmpp_means_scaled.fit(scaled_data)

n_found = len(gmmpp_means_scaled.cluster_centers_)
print("GMM-Means with scaled data finds: ", n_found, " clusters")

unique, counts = np.unique(gmmpp_means_scaled.labels_, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Interactive 3-D scatter of the gmmpp_means_scaled labels, drawn on the PCA
# columns with indices 2, 3 and 4.
c = gmmpp_means_scaled.labels_
x, y, z = (data_pd.pca_data[:, axis] for axis in (2, 3, 4))

# Hover text per point: "<label>---<activity entries joined by '-'>".
t = []
for idx, parts in enumerate(data_pd.clean_df['Activity'].tolist()):
    t.append(str(c[idx]) + '---' + '-'.join(parts))

trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means++ with scaled data')